Shanshan Wang
Instructor: Ari Anastassiou
Feb. 17, 2021
We have the locations of taxi ranks and want to identify key clusters of these ranks, so that a service station can be built in each cluster to serve all taxis operating in that region.
Task 1: Exploratory Data Analysis
Task 2: Visualizing Geographical Data
Task 3: Clustering Strength / Performance Metric
Task 4: K-Means Clustering
Task 5: DBSCAN
Task 6: HDBSCAN
Task 7: Addressing Outliers
import matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
#!pip install numpy==1.20.0
import numpy as np
from tqdm import tqdm
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier
from ipywidgets import interactive
from collections import defaultdict
#!pip install hdbscan
import hdbscan
#!pip install folium
import folium
import re
# 20 distinguishable colours, tiled so there are enough for many cluster labels
cols = ['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4',
'#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff',
'#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1',
'#000075', '#808080']*10
df=pd.read_csv('taxi_data.csv')
df.head()
| | LON | LAT | NAME |
|---|---|---|---|
| 0 | 28.17858 | -25.73882 | 11th Street Taxi Rank |
| 1 | 28.17660 | -25.73795 | 81 Bazaar Street Taxi Rank |
| 2 | 27.83239 | -26.53722 | Adams Road Taxi Rank |
| 3 | 28.12514 | -26.26666 | Alberton City Mall Taxi Rank |
| 4 | 28.10144 | -26.10567 | Alexandra Main Taxi Rank |
df.duplicated(subset=['LON','LAT']).values.any()
True
df.isna().values.any()
True
print(f'Before dropping NaNs and dupes\t:\tdf.shape = {df.shape}')
df.dropna(inplace=True)
df.drop_duplicates(subset=['LON','LAT'],keep='first',inplace=True)
print(f'After dropping NaNs and dupes\t:\tdf.shape = {df.shape}')
Before dropping NaNs and dupes : df.shape = (838, 3)
After dropping NaNs and dupes : df.shape = (823, 3)
df.head()
| | LON | LAT | NAME |
|---|---|---|---|
| 0 | 28.17858 | -25.73882 | 11th Street Taxi Rank |
| 1 | 28.17660 | -25.73795 | 81 Bazaar Street Taxi Rank |
| 2 | 27.83239 | -26.53722 | Adams Road Taxi Rank |
| 3 | 28.12514 | -26.26666 | Alberton City Mall Taxi Rank |
| 4 | 28.10144 | -26.10567 | Alexandra Main Taxi Rank |
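As one extra sanity check (a small addition, not part of the original run), we can inspect the coordinate ranges to confirm every rank falls in a plausible longitude/latitude window for the region (around Johannesburg/Pretoria, judging by the rank names):

# All ranks should fall in a plausible lon/lat window for the region;
# wildly out-of-range values would point to data-entry errors.
df[['LON', 'LAT']].describe()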
X=np.array(df[['LON','LAT']],dtype='float64')
plt.scatter(X[:,0],X[:,1],alpha=0.3,s=50)
m=folium.Map(location=[df.LAT.mean(),df.LON.mean()],zoom_start=9,tiles='Stamen Toner')
for _, row in df.iterrows():
    folium.CircleMarker(
        location=[row.LAT, row.LON],
        radius=5,
        popup=re.sub(r'[^a-zA-Z ]+', '', row.NAME),
        color='#1787FE',
        fill=True,
        fill_color='#1787FE'
    ).add_to(m)
m
X_blobs, _=make_blobs(n_samples=1000,centers=10,n_features=2,cluster_std=0.5, random_state=4)
plt.scatter(X_blobs[:,0],X_blobs[:,1],alpha=0.2)
class_predictions=np.load('sample_clusters.npy')
unique_clusters=np.unique(class_predictions)
for unique_cluster in unique_clusters:
    X_cluster = X_blobs[class_predictions==unique_cluster]  # points in this cluster
    plt.scatter(X_cluster[:,0], X_cluster[:,1], alpha=0.2, c=cols[unique_cluster])
silhouette_score(X_blobs,class_predictions)
0.6657220862867241
class_predictions=np.load('sample_clusters_improved.npy')
unique_clusters=np.unique(class_predictions)
for unique_cluster in unique_clusters:
    X_cluster = X_blobs[class_predictions==unique_cluster]
    plt.scatter(X_cluster[:,0], X_cluster[:,1], alpha=0.2, c=cols[unique_cluster])
silhouette_score(X_blobs,class_predictions)
0.7473587799908298
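As a reminder of what this metric summarizes: each point gets a score s = (b - a) / max(a, b), where a is its mean distance to points in its own cluster and b is its mean distance to the nearest other cluster, and silhouette_score is simply the mean of these per-point values. A minimal sketch verifying this with scikit-learn's silhouette_samples:

from sklearn.metrics import silhouette_samples

# The mean of the per-point silhouette values equals silhouette_score.
per_point = silhouette_samples(X_blobs, class_predictions)
print(per_point.mean())  # matches silhouette_score(X_blobs, class_predictions)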
X_blobs, _ = make_blobs(n_samples=1000, centers=50,
n_features=2, cluster_std=1, random_state=4)
data = defaultdict(dict)
for x in range(1,21):
    model = KMeans(n_clusters=3, random_state=17,
                   max_iter=x, n_init=1).fit(X_blobs)
    data[x]['class_predictions'] = model.predict(X_blobs)
    data[x]['centroids'] = model.cluster_centers_
    data[x]['unique_classes'] = np.unique(data[x]['class_predictions'])
def f(x):
    class_predictions = data[x]['class_predictions']
    centroids = data[x]['centroids']
    unique_classes = data[x]['unique_classes']

    for unique_class in unique_classes:
        plt.scatter(X_blobs[class_predictions==unique_class][:,0],
                    X_blobs[class_predictions==unique_class][:,1],
                    alpha=0.3, c=cols[unique_class])
    plt.scatter(centroids[:,0], centroids[:,1], s=200, c='#000000', marker='v')
    plt.ylim([-15,15]); plt.xlim([-15,15])
    plt.title('How K-Means Clusters')
interactive_plot = interactive(f, x=(1, 20))
output = interactive_plot.children[-1]
output.layout.height = '350px'
interactive_plot
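The widget shows convergence visually; a small optional sketch (not part of the original notebook) shows the same thing numerically via the model's inertia_, which flattens once the centroids stop moving:

# Inertia (sum of squared distances to the nearest centroid) should
# flatten once K-Means has converged; max_iter=1..20 mirrors the widget.
inertias = [KMeans(n_clusters=3, random_state=17, max_iter=x, n_init=1)
            .fit(X_blobs).inertia_ for x in range(1, 21)]
plt.plot(range(1, 21), inertias)
plt.xlabel('max_iter'); plt.ylabel('inertia')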
X=np.array(df[['LON','LAT']],dtype='float64')
k = 70
model = KMeans(n_clusters=k, random_state=17).fit(X)
class_predictions=model.predict(X)
df[f'CLUSTERS_kmeans{k}']=class_predictions
df.head()
| | LON | LAT | NAME | CLUSTERS_kmeans70 |
|---|---|---|---|---|
| 0 | 28.17858 | -25.73882 | 11th Street Taxi Rank | 2 |
| 1 | 28.17660 | -25.73795 | 81 Bazaar Street Taxi Rank | 2 |
| 2 | 27.83239 | -26.53722 | Adams Road Taxi Rank | 0 |
| 3 | 28.12514 | -26.26666 | Alberton City Mall Taxi Rank | 1 |
| 4 | 28.10144 | -26.10567 | Alexandra Main Taxi Rank | 1 |
def create_map(df, cluster_column):
    m = folium.Map(location=[df.LAT.mean(), df.LON.mean()], zoom_start=9, tiles='Stamen Toner')
    for _, row in df.iterrows():
        # get a colour for this cluster
        cluster_colour = cols[row[cluster_column]]
        folium.CircleMarker(
            location=[row['LAT'], row['LON']],
            radius=5,
            popup=row[cluster_column],
            color=cluster_colour,
            fill=True,
            fill_color=cluster_colour
        ).add_to(m)
    return m
m=create_map(df,'CLUSTERS_kmeans70')
print(f'K={k}')
print(f'Silhouette Score: {silhouette_score(X, class_predictions)}')
m.save('kmeans_70.html')
K=70
Silhouette Score: 0.43154470447876553
m
best_silhouette, best_k = -1, 0

for k in tqdm(range(2, 100)):
    model = KMeans(n_clusters=k, random_state=1).fit(X)
    class_predictions = model.predict(X)

    curr_silhouette = silhouette_score(X, class_predictions)
    if curr_silhouette > best_silhouette:
        best_k = k
        best_silhouette = curr_silhouette

print(f'K={best_k}')
print(f'Silhouette Score: {best_silhouette}')
100%|██████████| 98/98 [00:37<00:00, 2.63it/s]
K=98
Silhouette Score: 0.6971995093340411
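The search above keeps only the best K; an optional variant (the same loop, added here as a sketch) records every score so the whole curve can be plotted and the sensitivity to K inspected:

# Record the silhouette for every K to plot the whole curve,
# not just the maximum.
scores = []
k_values = range(2, 100)
for k in k_values:
    labels = KMeans(n_clusters=k, random_state=1).fit_predict(X)
    scores.append(silhouette_score(X, labels))
plt.plot(list(k_values), scores)
plt.xlabel('K'); plt.ylabel('Silhouette Score')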
# Give each outlier (-1) its own unique negative label so that
# silhouette_score can treat it as a singleton cluster.
dummy = np.array([-1, -1, -1, 2, 3, 4, 5, -1])
new = np.array([(counter+2)*x if x==-1 else x for counter, x in enumerate(dummy)])
# new == array([-2, -3, -4, 2, 3, 4, 5, -9])
model=DBSCAN(eps=0.01,min_samples=5).fit(X)
class_predictions=model.labels_
df['CLUSTERS_DBSCAN']=class_predictions
m=create_map(df,'CLUSTERS_DBSCAN')
print(f'Number of clusters found: {len(np.unique(class_predictions))-1}')  # exclude the -1 noise label
print(f'Number of outliers found: {len(class_predictions[class_predictions==-1])}')
print(f'Silhouette ignoring outliers: {silhouette_score(X[class_predictions!=-1], class_predictions[class_predictions!=-1])}')
# Treat each outlier as its own singleton cluster (see the dummy example above)
no_outliers = np.array([(counter+2)*x if x==-1 else x for counter, x in enumerate(class_predictions)])
print(f'Silhouette outliers as singletons: {silhouette_score(X, no_outliers)}')
Number of clusters found: 50
Number of outliers found: 289
Silhouette ignoring outliers: 0.9232138250288208
Silhouette outliers as singletons: 0.5667489350583482
m
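Picking eps by trial and error can be slow; a common heuristic (added here as a sketch, not part of the original analysis) is the k-distance plot: sort each point's distance to its min_samples-th nearest neighbour and look for a knee, which suggests a natural eps:

from sklearn.neighbors import NearestNeighbors

# With the query points in the training set, column 0 is each point's
# zero distance to itself, so column -1 corresponds to min_samples=5.
nn = NearestNeighbors(n_neighbors=5).fit(X)
distances, _ = nn.kneighbors(X)
plt.plot(np.sort(distances[:, -1]))
plt.ylabel('k-distance')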
model=hdbscan.HDBSCAN(min_cluster_size=5,min_samples=2,cluster_selection_epsilon=0.01)
class_predictions=model.fit_predict(X)
df['CLUSTERS_HDBSCAN']=class_predictions
m=create_map(df,'CLUSTERS_HDBSCAN')
print(f'Number of clusters found: {len(np.unique(class_predictions))-1}')
print(f'Number of outliers found: {len(class_predictions[class_predictions==-1])}')
print(f'Silhouette ignoring outliers: {silhouette_score(X[class_predictions!=-1], class_predictions[class_predictions!=-1])}')
no_outliers = np.array([(counter+2)*x if x==-1 else x for counter, x in enumerate(class_predictions)])
print(f'Silhouette outliers as singletons: {silhouette_score(X, no_outliers)}')
Number of clusters found: 66
Number of outliers found: 102
Silhouette ignoring outliers: 0.7670504356844786
Silhouette outliers as singletons: 0.638992483305273
m
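HDBSCAN also exposes a soft membership strength per point, which makes a useful extra diagnostic (a small sketch using the library's probabilities_ attribute; not part of the original analysis):

# probabilities_ is near 1.0 for points at the core of their cluster
# and 0.0 for noise; the histogram shows how confident the labels are.
plt.hist(model.probabilities_, bins=20)
plt.xlabel('cluster membership strength')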
hdbscan.HDBSCAN?
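To address the remaining outliers, we train a 1-nearest-neighbour classifier on the points HDBSCAN did cluster, then use it to assign each outlier to its nearest existing cluster.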
classifier=KNeighborsClassifier(n_neighbors=1)
df_train=df[df.CLUSTERS_HDBSCAN!=-1]
df_predict=df[df.CLUSTERS_HDBSCAN==-1]
X_train=np.array(df_train[['LON','LAT']],dtype='float64')
y_train=np.array(df_train['CLUSTERS_HDBSCAN'])  # 1-D labels, as sklearn expects
X_predict=np.array(df_predict[['LON','LAT']],dtype='float64')
classifier.fit(X_train,y_train)
KNeighborsClassifier(n_neighbors=1)
predictions=classifier.predict(X_predict)
predictions
array([26, 41, 13, 44, 57, 26, 34, 4, 60, 60, 16, 16, 61, 24, 51, 51, 51,
58, 41, 63, 13, 45, 15, 45, 45, 6, 0, 17, 26, 26, 26, 49, 49, 53,
49, 13, 61, 26, 26, 39, 65, 31, 31, 31, 0, 46, 46, 21, 58, 60, 6,
5, 24, 1, 63, 34, 64, 36, 36, 16, 7, 3, 64, 41, 13, 39, 39, 39,
41, 40, 40, 25, 59, 57, 61, 61, 62, 26, 15, 15, 59, 63, 6, 19, 61,
61, 48, 4, 41, 21, 64, 64, 64, 61, 61, 21, 23, 38, 31, 41, 55, 55])
df['CLUSTERS_hybrid']=df['CLUSTERS_HDBSCAN']
df.loc[df.CLUSTERS_HDBSCAN==-1,'CLUSTERS_hybrid']=predictions
m=create_map(df,'CLUSTERS_hybrid')
m
class_predictions=df.CLUSTERS_hybrid
print(f'Number of clusters found: {len(np.unique(class_predictions))}')
print(f'Silhouette: {silhouette_score(X, class_predictions)}')
m.save('hybrid.html')
Number of clusters found: 66
Silhouette: 0.5849126494706486
df['CLUSTERS_hybrid'].value_counts().plot.hist(bins=70,alpha=0.4,label='Hybrid')
df['CLUSTERS_kmeans70'].value_counts().plot.hist(bins=70,alpha=0.4,label='K-Means70')
plt.legend()
plt.title('Comparing Hybrid and K-Means Approaches')
plt.xlabel('Cluster Sizes')
For some additional reading, feel free to check out the documentation for K-Means, DBSCAN, and HDBSCAN clustering.
It may also be useful to look at the other commonly used forms of clustering available in the scikit-learn library. The HDBSCAN documentation also includes a good methodology for choosing a clustering algorithm based on your dataset and other limiting factors.